California Housing Prices

# Load the California housing data from the working directory.
housing <- read.csv("housing.csv")

# Fill missing total_bedrooms entries with the median of the observed values.
housing[["total_bedrooms"]] <- replace(
  housing[["total_bedrooms"]],
  is.na(housing[["total_bedrooms"]]),
  median(housing[["total_bedrooms"]], na.rm = TRUE)
)

# Per-household averages derived from the block-level totals.
housing[["mean_bedrooms"]] <- housing[["total_bedrooms"]] / housing[["households"]]
housing[["mean_rooms"]] <- housing[["total_rooms"]] / housing[["households"]]

# The raw totals are no longer needed once the averages exist.
drops <- c("total_bedrooms", "total_rooms")

housing <- housing[, setdiff(names(housing), drops)]
names(housing)
##  [1] "longitude"          "latitude"           "housing_median_age"
##  [4] "population"         "households"         "median_income"     
##  [7] "median_house_value" "ocean_proximity"    "mean_bedrooms"     
## [10] "mean_rooms"

Kolumny

# Preview the first rows of the prepared data frame (head() shows six by default).
head(housing)
##   longitude latitude housing_median_age population households median_income
## 1   -122.23    37.88                 41        322        126        8.3252
## 2   -122.22    37.86                 21       2401       1138        8.3014
## 3   -122.24    37.85                 52        496        177        7.2574
## 4   -122.25    37.85                 52        558        219        5.6431
## 5   -122.25    37.85                 52        565        259        3.8462
## 6   -122.25    37.85                 52        413        193        4.0368
##   median_house_value ocean_proximity mean_bedrooms mean_rooms
## 1             452600        NEAR BAY     1.0238095   6.984127
## 2             358500        NEAR BAY     0.9718805   6.238137
## 3             352100        NEAR BAY     1.0734463   8.288136
## 4             341300        NEAR BAY     1.0730594   5.817352
## 5             342200        NEAR BAY     1.0810811   6.281853
## 6             269700        NEAR BAY     1.1036269   4.761658
library(ggplot2)
library(DataExplorer)
library(ggpubr)
library(ggmap)

# Names of every column except ocean_proximity.
# The column is dropped by name rather than by position (previously
# `columns[-8]`), so the result stays correct if the column order changes.
columns <- setdiff(names(housing), "ocean_proximity")

# Density plots for the columns of housing.
# NOTE(review): per the DataExplorer documentation only continuous features
# are plotted, so ocean_proximity should be skipped automatically - confirm.
DataExplorer::plot_density(housing)

# Bar chart of the number of observations per ocean_proximity category.
# Bars are filled by category and outlined in black.
# (An earlier variant with uniformly grey bars had been left disabled here.)
count_plot <- ggplot(data = housing) +
  aes(x = ocean_proximity, fill = ocean_proximity) +
  geom_bar(colour = "black") +
  theme_pubclean()

count_plot

library(corrplot)

# Correlation heatmap for the columns of housing.
# NOTE(review): per the DataExplorer documentation, discrete columns such as
# ocean_proximity are one-hot encoded before correlations are computed - confirm.
DataExplorer::plot_correlation(housing)

Największe korelacje między zmiennymi:

library(ggplot2)



# Build a violin plot, with a narrow box plot overlaid, of one column of
# `data`, split and filled by ocean_proximity.
#
# Arguments:
#   data   data frame containing `ocean_proximity` and the column `y_var`
#   y_var  name of the column shown on the y axis, as a single string
# Returns the ggplot object; nothing is drawn until it is printed.
violin_by_proximity <- function(data, y_var) {
  ggplot(
    data,
    aes(x = ocean_proximity, y = .data[[y_var]], fill = ocean_proximity)
  ) +
    geom_violin() +
    geom_boxplot(width = 0.1) +
    # Set the axis title explicitly so it shows the plain column name
    # regardless of how the `.data[[...]]` expression would be labelled.
    labs(y = y_var) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
}

# One plot per numeric column; the names p1..p7 are kept as before.
p1 <- violin_by_proximity(housing, "housing_median_age")
p2 <- violin_by_proximity(housing, "mean_rooms")
p3 <- violin_by_proximity(housing, "mean_bedrooms")
p4 <- violin_by_proximity(housing, "population")
p5 <- violin_by_proximity(housing, "households")
p6 <- violin_by_proximity(housing, "median_income")
p7 <- violin_by_proximity(housing, "median_house_value")

p1

# p2 and p3 (mean_rooms, mean_bedrooms) are built but deliberately not shown.
#p2
#p3
p4

p5

p6

p7

# Bounding box that encloses every observation, with the element names
# (left, bottom, right, top) passed on to get_stamenmap().
us <- c(
  left = min(housing$longitude),
  bottom = min(housing$latitude),
  right = max(housing$longitude),
  top = max(housing$latitude)
)

# Base map layer built from "toner-lite" tiles for that bounding box.
# NOTE(review): Stamen-hosted tiles have reportedly moved to Stadia Maps, and
# newer ggmap releases may require get_stadiamap() with an API key - confirm.
p <- ggmap(get_stamenmap(us, zoom = 7, maptype = "toner-lite"))

# Observations coloured by median house value.
p +
  geom_point(
    data = housing,
    mapping = aes(x = longitude, y = latitude, color = median_house_value)
  ) +
  labs(title = "Heatmap of median_house_value") +
  theme_light() +
  theme(legend.position = "bottom", legend.direction = "vertical")

# Observations coloured by ocean_proximity category, as small translucent points.
p +
  geom_point(
    data = housing,
    mapping = aes(x = longitude, y = latitude, color = ocean_proximity),
    alpha = 0.5,
    size = 0.5
  ) +
  labs(title = "Ocean_proximity on map") +
  theme_light() +
  theme(legend.position = "bottom", legend.direction = "horizontal")

# Count the observations whose ocean_proximity is ISLAND.
# nrow() is used because length() of a data frame is its number of columns;
# the previously recorded output "[1] 10" was therefore the column count, not
# the number of ISLAND observations. Re-knit the report to refresh the output.
a <- dplyr::filter(housing, ocean_proximity == "ISLAND")
nrow(a)

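Podział kategorii zmiennej ocean_proximity i wizualizacja na mapie mogą wytłumaczyć dziwne wykresy skrzypcowe dla ocean_proximity=ISLAND: tę wartość ma jedynie kilka obserwacji. Uwaga: liczba 10 podana powyżej to liczba kolumn ramki danych (tyle zwraca length() dla ramki danych), a nie liczba obserwacji; liczbę obserwacji podaje nrow().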

# Population against number of households, coloured by median house value,
# with a linear fit drawn in red.
# alpha and size are set in geom_point(): when passed to ggplot() they were
# silently ignored, so the points were drawn opaque at the default size.
ggplot(
  data = housing,
  aes(x = households, y = population, color = median_house_value)
) +
  geom_point(alpha = 0.5, size = 0.5) +
  ggtitle("Households and population correlation") +
  theme_light() +
  stat_smooth(method = "lm", col = "red") +
  theme(legend.position = "bottom", legend.direction = "vertical")
## `geom_smooth()` using formula 'y ~ x'

# Mean bedrooms against mean rooms per household, coloured by median house
# value, with a linear fit drawn in red.
# alpha and size are set in geom_point(): when passed to ggplot() they were
# silently ignored, so the points were drawn opaque at the default size.
ggplot(
  data = housing,
  aes(x = mean_rooms, y = mean_bedrooms, color = median_house_value)
) +
  geom_point(alpha = 0.5, size = 0.5) +
  ggtitle("Mean_room and mean_bedroom correlation") +
  theme_light() +
  stat_smooth(method = "lm", col = "red") +
  theme(legend.position = "bottom", legend.direction = "vertical")
## `geom_smooth()` using formula 'y ~ x'

# Median house value against median income, with a linear fit drawn in red.
# alpha and size are set in geom_point(): when passed to ggplot() they were
# silently ignored, so the points were drawn opaque at the default size.
ggplot(
  data = housing,
  aes(x = median_income, y = median_house_value)
) +
  geom_point(alpha = 0.5, size = 0.5) +
  ggtitle("Median_income and median_house_value correlation") +
  theme_light() +
  stat_smooth(method = "lm", col = "red") +
  theme(legend.position = "bottom", legend.direction = "vertical")
## `geom_smooth()` using formula 'y ~ x'

# Largest median_house_value present in the data.
max(housing$median_house_value)
## [1] 500001

Ostatni wykres oraz wartość maksymalna pokazują, że zmienna celu (median_house_value) została obcięta z góry do wartości 500001.